Final Implementation of the Project

Exploratory Data Analysis

# Importing all the required libraries
library(tidyverse)
library(ggplot2)
library(plotly)
library(ipumsr)
library(haven)
library(splitstackshape)
library(caret)
library(randomForest)
# A. Loading the data and creating a dataframe
# read_ipums_ddi() parses the DDI codebook (XML); read_ipums_micro() then
# loads the matching NHIS microdata with value labels attached, which is why
# many columns below carry the haven_labelled class.
df_ddi <- read_ipums_ddi("Dataset/nhis_00001.xml")
df <- as.data.frame(read_ipums_micro(df_ddi, verbose = FALSE))
# Quick look at the first five rows
df %>% head(5)
##   YEAR SERIAL STRATA PSU        NHISHID REGION URBRRL PERNUM          NHISPID
## 1 2020      1    150  25 0002020H000002      3      2      1 0002020H00000210
## 2 2020      2    111  10 0002020H000003      4      3      1 0002020H00000310
## 3 2020      3    133   3 0002020H000004      2      2      1 0002020H00000410
## 4 2020      4    139  45 0002020H000007      3      3      1 0002020H00000710
## 5 2020      5    130  21 0002020H000009      3      1      1 0002020H00000910
##       HHX SAMPWEIGHT LONGWEIGHT PARTWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN
## 1 H000002   5946.002   17605.50      0.000        1        0  56   2        2
## 2 H000003   6288.726       0.00   5853.664        1        0  66   1        2
## 3 H000004   6083.271       0.00   5814.397        1        0  59   2        2
## 4 H000007  11306.962       0.00  10626.550        1        0  56   1        2
## 5 H000009   6471.818   19317.18      0.000        1        0  48   2        2
##   MARSTAT MARST FAMSIZE PARTNEREMP FAMKIDNO RACEA ARMFEV EDUC SPOUSEDUC
## 1      10    11       3          0        1   100     11  501         3
## 2      10    11       2          0        0   100     11  501        10
## 3      10    11       2          0        0   100     11  502         9
## 4      30    30       1          0        0   100     11  103         0
## 5      99    99       4          0        1   100     98  400         0
##   SCHOOLNOW EMPSTAT HOURSWRK PAIDSICK EMPHI EMPFT INCFAM07ON FAMTOTINC USUALPL
## 1         1     200        0        0     0     0         11     27000       2
## 2         1     100       40        2     2     2         24    250000       3
## 3         1     100       40        1     1     2         24    150000       2
## 4         1     999        0        0     0     0         22     54000       2
## 5         8     999        0        0     0     0         23     95000       2
##   HINOTCOVE CVDDIAG CVDTEST CVDTESTRSLT
## 1         1       1       1           0
## 2         1       0       0           0
## 3         1       0       0           0
## 4         1       0       0           0
## 5         2       1       1           0
# B. Preliminary check on the data
#  a. Review classes of every column
# lapply() is type-stable (always returns a list); sapply() would silently
# simplify to a character vector if every column happened to have a single
# class, changing the printed output shape. With this data (mixed class
# lengths) the output is identical.
lapply(df, class)
## $YEAR
## [1] "numeric"
## 
## $SERIAL
## [1] "numeric"
## 
## $STRATA
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $PSU
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $NHISHID
## [1] "character"
## 
## $REGION
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $URBRRL
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $PERNUM
## [1] "numeric"
## 
## $NHISPID
## [1] "character"
## 
## $HHX
## [1] "character"
## 
## $SAMPWEIGHT
## [1] "numeric"
## 
## $LONGWEIGHT
## [1] "numeric"
## 
## $PARTWEIGHT
## [1] "numeric"
## 
## $ASTATFLG
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CSTATFLG
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $AGE
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $SEX
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $SEXORIEN
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $MARSTAT
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $MARST
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $FAMSIZE
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $PARTNEREMP
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $FAMKIDNO
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $RACEA
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $ARMFEV
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EDUC
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $SPOUSEDUC
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $SCHOOLNOW
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EMPSTAT
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $HOURSWRK
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $PAIDSICK
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EMPHI
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EMPFT
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $INCFAM07ON
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $FAMTOTINC
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $USUALPL
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $HINOTCOVE
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CVDDIAG
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CVDTEST
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CVDTESTRSLT
## [1] "haven_labelled" "vctrs_vctr"     "integer"
# Note: some of the variables have attached data definitions "haven labeled"

# b. Check for missing values (count of NAs per column)
# colSums(is.na(.)) is the vectorized idiom for per-column NA counts and
# avoids the anonymous-function sapply() call.
colSums(is.na(df))
##        YEAR      SERIAL      STRATA         PSU     NHISHID      REGION 
##           0           0           0           0           0           0 
##      URBRRL      PERNUM     NHISPID         HHX  SAMPWEIGHT  LONGWEIGHT 
##           0           0           0           0           0           0 
##  PARTWEIGHT    ASTATFLG    CSTATFLG         AGE         SEX    SEXORIEN 
##           0           0           0           0           0           0 
##     MARSTAT       MARST     FAMSIZE  PARTNEREMP    FAMKIDNO       RACEA 
##           0           0           0           0           0           0 
##      ARMFEV        EDUC   SPOUSEDUC   SCHOOLNOW     EMPSTAT    HOURSWRK 
##           0           0           0           0           0           0 
##    PAIDSICK       EMPHI       EMPFT  INCFAM07ON   FAMTOTINC     USUALPL 
##           0           0           0           0           0           0 
##   HINOTCOVE     CVDDIAG     CVDTEST CVDTESTRSLT 
##           0           0           0           0
# c. Dropping identifier and survey-weight variables that are not important
# or provide no insights into the data
cols_to_drop <- c("YEAR", "SERIAL", "STRATA", "PSU", "NHISHID",
                  "NHISPID", "HHX", "SAMPWEIGHT", "LONGWEIGHT", "PARTWEIGHT")
df <- dplyr::select(df, -all_of(cols_to_drop))
colnames(df)
##  [1] "REGION"      "URBRRL"      "PERNUM"      "ASTATFLG"    "CSTATFLG"   
##  [6] "AGE"         "SEX"         "SEXORIEN"    "MARSTAT"     "MARST"      
## [11] "FAMSIZE"     "PARTNEREMP"  "FAMKIDNO"    "RACEA"       "ARMFEV"     
## [16] "EDUC"        "SPOUSEDUC"   "SCHOOLNOW"   "EMPSTAT"     "HOURSWRK"   
## [21] "PAIDSICK"    "EMPHI"       "EMPFT"       "INCFAM07ON"  "FAMTOTINC"  
## [26] "USUALPL"     "HINOTCOVE"   "CVDDIAG"     "CVDTEST"     "CVDTESTRSLT"
# Feature Engineering variables
# REGION, URBRRL, PERNUM, ASTATFLG, CSTATFLG, AGE, FAMKIDNO don't need any
# feature engineering yet.
# (All assignments below use `<-`; the original used `=` for assignment.)

# a. Handling SEX and SEXORIEN variables
# Combining all 'unknown' categories (refused / not ascertained / don't know)
# into one code (9)
unk <- c(7, 8, 9)
df$SEX[df$SEX %in% unk] <- 9

# Combining Unknown SEXORIEN codes into the "Something else" category (4)
unk <- c(5, 7, 8)
df$SEXORIEN[df$SEXORIEN %in% unk] <- 4

# b. Handling RACEA variable
# Combining all 'unknown' categories into one (900)
unk <- c(580, 900, 970, 980, 990)
df$RACEA[df$RACEA %in% unk] <- 900

# c. Handling MARSTAT & MARST variables
# Both variables are indicative of the same feature, so only the current
# marital status (MARST) is kept for this analysis.
df <- df %>% select(-MARSTAT)

# Combining NIU and Unknown into one code (99), and all married labels into
# one code (10)
unk <- c(0, 99)
df$MARST[df$MARST %in% unk] <- 99

marr <- c(10, 11, 12, 13)
df$MARST[df$MARST %in% marr] <- 10

# d. Handling FAMSIZE
# Combine unknown codes into one (98)
unk <- c(98, 99)
df$FAMSIZE[df$FAMSIZE %in% unk] <- 98

# e. Handling PARTNEREMP: combine unknowns into one (9)
unk <- c(7, 8, 9)
df$PARTNEREMP[df$PARTNEREMP %in% unk] <- 9

# f. Handling ARMFEV by combining all unknowns into one (99)
unk <- c(97, 98, 99)
df$ARMFEV[df$ARMFEV %in% unk] <- 99

# g. Handling EDUC, SPOUSEDUC, SCHOOLNOW by combining all unknowns
unk <- c(996, 997, 998, 999)
df$EDUC[df$EDUC %in% unk] <- 996

unk <- c(97, 98, 99)
df$SPOUSEDUC[df$SPOUSEDUC %in% unk] <- 99

unk <- c(7, 8, 9)
df$SCHOOLNOW[df$SCHOOLNOW %in% unk] <- 9

# h. Handling Employment Status
# Table below shows the distribution of employment categories
# Labels:
# value                                            label
#     0                                              NIU
#   100                                         Employed
#   110                                          Working
#   111                  Working for pay at job/business
#   112              Working, w/out pay, at job/business
#   120                        With job, but not at work
#   121 With job, not at work: not laid-off, not looking
#   122                   With job, not at work: looking
#   200                                     Not employed
#   210                                       Unemployed
#   211                            Unemployed: On layoff
#   212                Unemployed: On layoff and looking
#   213           Unemployed: Unk if looking or laid off
#   214                 Unemployed: Looking or on layoff
#   215                Unemployed: Have job to return to
#   216             Unemployed: Had job during the round
#   217       Unemployed: No job during reference period
#   220                               Not in labor force
#   900                               Unknown-all causes
#   997                                  Unknown-refused
#   998                          Unknown-not ascertained
#   999                               Unknown-don't know

# Combining all working codes into one (110), with-job codes into one (120),
# unemployed codes into one (200), and unknown codes into one (999)
work <- c(110, 111, 112)
df$EMPSTAT[df$EMPSTAT %in% work] <- 110
w_job <- c(120, 121, 122)
df$EMPSTAT[df$EMPSTAT %in% w_job] <- 120
unemployed <- c(200, 210:217)
df$EMPSTAT[df$EMPSTAT %in% unemployed] <- 200
unk <- 997:999
df$EMPSTAT[df$EMPSTAT %in% unk] <- 999

# i. Handling HOURSWRK by recoding the unknown-hour codes (97-99) to 0.
# (The original set also listed 0, but replacing 0 with 0 is a no-op.)
unk <- 97:99
df$HOURSWRK[df$HOURSWRK %in% unk] <- 0

# j. Handling all remaining variables
# For each of these coded variables every value above 4 is a flavour of
# 'unknown' (7 refused, 8 not ascertained, 9 don't know), so all of them
# are collapsed into the single code 9 via direct indexed assignment.

# PAIDSICK: combine unknowns into one
df$PAIDSICK[df$PAIDSICK > 4] <- 9

# EMPHI: combine unknowns into one
df$EMPHI[df$EMPHI > 4] <- 9

# USUALPL codes:
# value                         label
#     0                           NIU
#     1       There is no place or No
#     2 Yes, has a usual place or Yes
#     3  There is more than one place
#     7               Unknown-refused
#     8       Unknown-not ascertained
#     9            Unknown-don't know
#
# 2 and 3 are combined as 2; 7, 8, 9 are combined as 9
df$USUALPL[df$USUALPL == 3] <- 2
df$USUALPL[df$USUALPL >= 7] <- 9

# HINOTCOVE: combine all unknowns into one
df$HINOTCOVE[df$HINOTCOVE > 4] <- 9

# CVDTEST: same transformation as above
df$CVDTEST[df$CVDTEST > 4] <- 9

# CVDDIAG: same transformation as above
df$CVDDIAG[df$CVDDIAG > 4] <- 9

# CVDTESTRSLT codes:
# value                   label
#     0                     NIU
#     1                      No
#     2                     Yes
#     3 Did not receive results
#     7         Unknown-refused
#     8 Unknown-not ascertained
#     9      Unknown-don't know
#
# Same transformation: combine all unknowns into 9
df$CVDTESTRSLT[df$CVDTESTRSLT > 4] <- 9

Data Visualization on the clean data

# A. Checking for health insurance coverage and covid relation
counts_hinc <- table(df$HINOTCOVE, df$CVDTESTRSLT)

rownames(counts_hinc) <- c('Has coverage','No coverage','Unknown')
colnames(counts_hinc) <- c('NIU', 'Negative','Positive',"Did Not Receive results",'Unknown')

# All three coverage rows are stacked in each bar, so the palette and the
# legend must both cover 3 rows. The original `rainbow(3:6)` silently used
# only its first element (3), and the legend omitted the plotted 'Unknown'
# row.
barplot(counts_hinc[, 2:3], col = rainbow(3),
        main = 'Test Results based on Insurance Coverage',
        legend = rownames(counts_hinc), xlab = 'COVID Test Result')

# B. Seeing the number of people who got tested given they have health insurance coverage from the company
sample_df <- df %>%
    group_by(CVDTEST = as.factor(CVDTEST), EMPHI = as.factor(EMPHI)) %>%
    dplyr::summarize(count = n())
## `summarise()` has grouped output by 'CVDTEST'. You can override using the `.groups` argument.
ggplot(sample_df, aes(fill = CVDTEST, y = count, x = CVDTEST)) +
    geom_bar(position = "dodge", stat = "identity") +
    ggtitle("People who tested when they get Health insurance coverage from the Company") +
    # Facets are by EMPHI, so the labeller must be keyed on EMPHI; the
    # original keyed it on CVDTEST so the labels were never applied.
    facet_wrap(~EMPHI, ncol = 4,
               labeller = labeller(EMPHI = c("0" = "NIU",
                                             "1" = "Not Covered by Company",
                                             "2" = "Covered by Company",
                                             "9" = "Unknown"))) +
    xlab("Covid Test") + ylab("Frequency") +
    scale_x_discrete(labels = c("NIU", "Negative", "Positive", "No Results")) +
    scale_fill_manual(values = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C")) +
    # theme_minimal() is a *complete* theme and resets every theme element,
    # so it must come before the theme() tweaks; the original applied it
    # after theme(legend.position = "none"), which re-enabled the legend.
    theme_minimal() +
    theme(legend.position = "none",
          plot.title = element_text(hjust = 0.5),
          axis.text.x = element_text(angle = 30))

# C. Seeing results between getting sick leave and covid test results
sick_counts <- table(df$PAIDSICK, df$CVDTESTRSLT)
rownames(sick_counts) <- c('NIU', 'No sick leave', 'Sick leave', 'Unknown')
colnames(sick_counts) <- c('NIU', 'Negative', 'Positive', "Did Not Receive Results", 'Unknown')

# Only the No/Yes sick-leave rows (2:3) and the Negative/Positive result
# columns (2:3) are plotted.
barplot(sick_counts[2:3, 2:3],
        col = rainbow(2),
        legend = rownames(sick_counts)[2:3],
        main = 'Test Results based on having Sick Leave',
        xlab = 'Testing Opportunity')

# D. Does having a usual place for medical care affect the testing
# Restrict to respondents with a usual place of care (USUALPL == 2), then
# tally covid-test status.
usual_place_df <- df %>%
    filter(USUALPL == 2) %>%
    group_by(CVDTEST) %>%
    summarise(n = n())
usual_place_df %>% head()
## # A tibble: 4 x 2
##                  CVDTEST     n
##                <int+lbl> <int>
## 1 0 [NIU]                16418
## 2 1 [No]                 12956
## 3 2 [Yes]                 5236
## 4 9 [Unknown-don't know]    33
# Highlight the 'Yes, tested' bar in red; the other bars stay grey.
bar_colors <- c('rgba(204,204,204,1)', 'rgba(204,204,204,1)',
                'rgba(222,45,38,0.8)', 'rgba(204,204,204,1)')
fig <- plot_ly(usual_place_df, x = ~CVDTEST, y = ~n, type = 'bar',
               marker = list(color = bar_colors)) %>%
    layout(title = "Seeing if having Usual Place of medical care encourages Covid Testing",
           xaxis = list(title = ""),
           yaxis = list(title = ""))

fig

Data Modelling

Question 1

How does socioeconomic status affect COVID-19 infection status?

# A. Creating Stratified Sample
table(df$CVDDIAG)
## 
##     0     1     2     9 
## 17649 18954   671    84
# Dropping the NIU (0) and Unknown (9) diagnosis codes as they provide no
# insights into the data
sample_df <- df %>% filter(CVDDIAG != 0)
sample_df <- sample_df %>% filter(CVDDIAG != 9)

# Stratified sample of up to 2000 rows per diagnosis group
strat <- stratified(sample_df, group = 'CVDDIAG', size = 2000)
## Groups 2 contain fewer rows than requested. Returning all rows.
# Recoding the outcome to 0/1 (1 'No' -> 0, 2 'Yes' -> 1) for modelling
strat$CVDDIAG[strat$CVDDIAG == 1] <- 0
strat$CVDDIAG[strat$CVDDIAG == 2] <- 1

# Seeing the distribution of the dataset
table(strat$CVDDIAG)/length(strat$CVDDIAG)
## 
##         0         1 
## 0.7487832 0.2512168
# Visualising it
barplot(table(strat$CVDDIAG))

# Splitting the data into training (70%) and testing (30%) sets
set.seed(101)
idx <- sample.int(n = nrow(strat), size = floor(0.70 * nrow(strat)), replace = FALSE)

s.train <- strat[idx,]
s.test <- strat[-idx,]
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
# Selecting personal information variables
# (the original vector listed "EDUC" twice; the duplicate is removed)
per_info_vars <- c("AGE", "SEX", "MARST", "RACEA", "SEXORIEN", "EDUC",
                   "ASTATFLG", "CSTATFLG", "REGION", "CVDDIAG")

# all_of() makes the use of an external character vector explicit and
# silences the tidyselect ambiguity message the bare vector produced.
data_to_study.train <- s.train %>% dplyr::select(all_of(per_info_vars))
data_to_study.test  <- s.test %>% dplyr::select(all_of(per_info_vars))

# Fitting model using stepAIC (k = 2, i.e. classic AIC penalty) to find the
# optimal combination of variables
model <- glm(data = data_to_study.train, CVDDIAG ~ ., family = 'binomial') %>%
    stepAIC(trace = FALSE, k = 2)

# Printing the summary
summary(model)
## 
## Call:
## glm(formula = CVDDIAG ~ AGE + RACEA + ASTATFLG, family = "binomial", 
##     data = data_to_study.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1330  -0.7770  -0.6897  -0.4923   4.8696  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.9427903  0.2391516  -8.124 4.52e-16 ***
## AGE         -0.0113351  0.0031403  -3.610 0.000307 ***
## RACEA        0.0008603  0.0002546   3.379 0.000727 ***
## ASTATFLG     1.3012420  0.2700237   4.819 1.44e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2079.2  on 1868  degrees of freedom
## Residual deviance: 2034.4  on 1865  degrees of freedom
## AIC: 2042.4
## 
## Number of Fisher Scoring iterations: 5
# Checking the confusion matrix on the held-out test set
lr_probs <- predict(model, newdata = data_to_study.test, type = 'response')
# Threshold the predicted probabilities at 0.5
lr_predicted <- as.numeric(lr_probs >= 0.5)
outcome_levels <- min(data_to_study.test$CVDDIAG):max(data_to_study.test$CVDDIAG)
confusionMatrix(factor(lr_predicted, levels = outcome_levels),
                factor(data_to_study.test$CVDDIAG, levels = outcome_levels))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 588 214
##          1   0   0
##                                           
##                Accuracy : 0.7332          
##                  95% CI : (0.7011, 0.7635)
##     No Information Rate : 0.7332          
##     P-Value [Acc > NIR] : 0.5184          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7332          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7332          
##          Detection Rate : 0.7332          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
# Diagnostic plots (residuals, QQ, scale-location, leverage) for the
# stepAIC-selected model
plot(model)

# Fitting a second logistic model with the full set of personal variables
# (no stepwise selection) for comparison against the stepAIC model
m1 <- glm(data = data_to_study.train, CVDDIAG ~ AGE + RACEA + SEX + SEXORIEN + MARST + REGION + EDUC, family = 'binomial')

summary(m1)
## 
## Call:
## glm(formula = CVDDIAG ~ AGE + RACEA + SEX + SEXORIEN + MARST + 
##     REGION + EDUC, family = "binomial", data = data_to_study.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3057  -0.7649  -0.6875  -0.5501   4.4858  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6350381  0.3490563  -4.684 2.81e-06 ***
## AGE         -0.0093570  0.0032519  -2.877 0.004010 ** 
## RACEA        0.0009083  0.0002589   3.508 0.000452 ***
## SEX          0.0933984  0.1099770   0.849 0.395740    
## SEXORIEN     0.3217312  0.0951322   3.382 0.000720 ***
## MARST       -0.0018908  0.0024325  -0.777 0.436985    
## REGION      -0.0083507  0.0530949  -0.157 0.875025    
## EDUC         0.0003948  0.0004548   0.868 0.385434    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2079.2  on 1868  degrees of freedom
## Residual deviance: 2039.3  on 1861  degrees of freedom
## AIC: 2055.3
## 
## Number of Fisher Scoring iterations: 5
# Confusion matrix for m1 on the held-out test set
lr_probs <- predict(m1, newdata = data_to_study.test, type = 'response')
# Threshold the predicted probabilities at 0.5
lr_predicted <- as.numeric(lr_probs >= 0.5)
outcome_levels <- min(data_to_study.test$CVDDIAG):max(data_to_study.test$CVDDIAG)
cf <- confusionMatrix(factor(lr_predicted, levels = outcome_levels),
                      factor(data_to_study.test$CVDDIAG, levels = outcome_levels))
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# Export the confusion-matrix counts as a LaTeX table
conf.mat <- as.data.frame.matrix(cf$table)
stargazer(conf.mat, title = "Confusion Matrix", summary = FALSE)
## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: Wed, Dec 08, 2021 - 17:59:02
## \begin{table}[!htbp] \centering 
##   \caption{Confusion Matrix} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}} ccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & 0 & 1 \\ 
## \hline \\[-1.8ex] 
## 0 & $587$ & $213$ \\ 
## 1 & $1$ & $1$ \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}

Question 2

What are the driving factors behind infections?

# A. Using a random forest classifier to see which variables drive the
# diagnosis. (The original comment mentioned logistic regression / stepAIC,
# but this section fits a random forest.)

library(randomForestExplainer)
## Warning: package 'randomForestExplainer' was built under R version 4.1.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Creating samples: split predictors (x) and outcome (y) for train/test

x.train <- s.train %>% dplyr::select(-CVDDIAG)
y.train <- s.train %>% dplyr::select(CVDDIAG)

x.test <- s.test %>% dplyr::select(-CVDDIAG)
y.test <- s.test %>% dplyr::select(CVDDIAG)

# 500-tree classification forest; importance = T keeps accuracy/Gini
# importance, proximity = T stores the case-proximity matrix.
# NOTE(review): CVDTEST and CVDTESTRSLT remain among the predictors and
# dominate the importance output below — this likely leaks outcome
# information into the model; confirm intent.
model.rf <- randomForest::randomForest(x = x.train, y = as.factor(y.train$CVDDIAG), ntree = 500, importance = T, proximity = T)
print(model.rf)
## 
## Call:
##  randomForest(x = x.train, y = as.factor(y.train$CVDDIAG), ntree = 500,      importance = T, proximity = T) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 7.44%
## Confusion matrix:
##      0   1 class.error
## 0 1396  16  0.01133144
## 1  123 334  0.26914661
# Column 2 of the importance matrix = MeanDecreaseGini
print(importance(model.rf,2))
##             MeanDecreaseGini
## REGION            15.1776486
## URBRRL            15.5461451
## PERNUM             0.4920045
## ASTATFLG           0.2890717
## CSTATFLG           0.4990005
## AGE               39.5814821
## SEX                6.7601661
## SEXORIEN           7.3789954
## MARST             10.9652030
## FAMSIZE           14.4032490
## PARTNEREMP         3.5806953
## FAMKIDNO           9.5932683
## RACEA             11.7607230
## ARMFEV             4.6114876
## EDUC              19.6018951
## SPOUSEDUC         12.6240833
## SCHOOLNOW          4.1628201
## EMPSTAT            2.7761303
## HOURSWRK          20.6186549
## PAIDSICK           6.1470726
## EMPHI              6.2977804
## EMPFT              4.6548641
## INCFAM07ON        12.7385126
## FAMTOTINC         38.5097475
## USUALPL            2.4717670
## HINOTCOVE          3.0837250
## CVDTEST           86.6101415
## CVDTESTRSLT      310.7068821
# OOB error rate as a function of the number of trees
plot(model.rf)

# Evaluate the forest on the held-out test set
rf_preds <- predict(model.rf, newdata = x.test)
actual <- droplevels(as.factor(y.test$CVDDIAG))
confusionMatrix(rf_preds, actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 580  56
##          1   8 158
##                                          
##                Accuracy : 0.9202         
##                  95% CI : (0.8992, 0.938)
##     No Information Rate : 0.7332         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7804         
##                                          
##  Mcnemar's Test P-Value : 4.228e-09      
##                                          
##             Sensitivity : 0.9864         
##             Specificity : 0.7383         
##          Pos Pred Value : 0.9119         
##          Neg Pred Value : 0.9518         
##              Prevalence : 0.7332         
##          Detection Rate : 0.7232         
##    Detection Prevalence : 0.7930         
##       Balanced Accuracy : 0.8624         
##                                          
##        'Positive' Class : 0              
## 
# Multi-metric variable importance from randomForestExplainer: min depth,
# number of nodes, accuracy/Gini decrease, times-a-root, and p-values
frame <- measure_importance(model.rf)
frame
##       variable mean_min_depth no_of_nodes accuracy_decrease gini_decrease
## 1          AGE        2.58000       11141      1.176346e-02    39.5814821
## 2       ARMFEV        5.25932        2004      6.923569e-04     4.6114876
## 3     ASTATFLG       10.13896         149      2.853428e-04     0.2890717
## 4     CSTATFLG        9.93798         157      2.608123e-04     0.4990005
## 5      CVDTEST        2.68674        2157      4.987778e-02    86.6101415
## 6  CVDTESTRSLT        1.83000        5205      1.667199e-01   310.7068821
## 7         EDUC        3.59200        7439      5.184685e-03    19.6018951
## 8        EMPFT        5.82216        1590      4.602736e-03     4.6548641
## 9        EMPHI        5.21142        1888      4.163632e-03     6.2977804
## 10     EMPSTAT        6.20004        1035      1.828076e-03     2.7761303
## 11    FAMKIDNO        4.26200        3830      6.560714e-03     9.5932683
## 12     FAMSIZE        3.66200        5969      6.278135e-03    14.4032490
## 13   FAMTOTINC        2.81600       11199      9.298352e-03    38.5097475
## 14   HINOTCOVE        6.35080        1256      3.905990e-04     3.0837250
## 15    HOURSWRK        3.15200        5547      1.014233e-02    20.6186549
## 16  INCFAM07ON        4.13200        5176      4.226255e-03    12.7385126
## 17       MARST        4.16000        4580      5.607950e-03    10.9652030
## 18    PAIDSICK        5.17654        2052      3.661607e-03     6.1470726
## 19  PARTNEREMP        6.27538        1257      2.188198e-04     3.5806953
## 20      PERNUM        9.42906         282      6.436061e-04     0.4920045
## 21       RACEA        3.65400        3713      1.653747e-03    11.7607230
## 22      REGION        3.86200        6369     -8.313457e-04    15.1776486
## 23   SCHOOLNOW        5.21952        1543      1.332205e-03     4.1628201
## 24         SEX        4.99322        3477      6.147641e-05     6.7601661
## 25    SEXORIEN        4.10322        2220      2.951308e-03     7.3789954
## 26   SPOUSEDUC        4.26600        4781      3.998279e-03    12.6240833
## 27      URBRRL        3.55600        6247      1.933427e-03    15.5461451
## 28     USUALPL        7.11012        1041      5.671243e-05     2.4717670
##    no_of_trees times_a_root       p_value
## 1          500           33  0.000000e+00
## 2          494            5  1.000000e+00
## 3          132            5  1.000000e+00
## 4          141            6  1.000000e+00
## 5          483           69  1.000000e+00
## 6          500           83 3.141836e-127
## 7          500            6  0.000000e+00
## 8          472           31  1.000000e+00
## 9          489           49  1.000000e+00
## 10         418           18  1.000000e+00
## 11         500            0  9.723098e-03
## 12         500            2 3.471439e-271
## 13         500            5  0.000000e+00
## 14         460            3  1.000000e+00
## 15         500           69 9.891712e-186
## 16         500            0 9.879519e-123
## 17         500            9  2.623109e-47
## 18         493           44  1.000000e+00
## 19         471            1  1.000000e+00
## 20         227            1  1.000000e+00
## 21         500           22  3.486345e-01
## 22         500            1  0.000000e+00
## 23         484           12  1.000000e+00
## 24         499            0  9.998416e-01
## 25         499           15  1.000000e+00
## 26         500            3  5.066819e-69
## 27         500            8  0.000000e+00
## 28         454            0  1.000000e+00
# Pairwise relations between the different importance measures
plot_importance_ggpairs(frame)

# Rank agreement across the importance measures
plot_importance_rankings(frame)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Pull the MeanDecreaseAccuracy / MeanDecreaseGini columns (3:4) from the
# fitted forest's importance matrix
imp <- as.data.frame(model.rf$importance[, 3:4])

# Sort descending by Gini decrease (reorder() below also orders the bars)
imp <- imp %>% arrange(desc(MeanDecreaseGini))

# Horizontal bar chart of variables ordered by Gini importance
ggplot(imp,
       aes(x = reorder(rownames(imp), MeanDecreaseGini),
           y = MeanDecreaseGini,
           fill = rownames(imp))) +
    geom_bar(stat = "identity") +
    ggtitle("Variable Importance based on MeanDecreaseGini") +
    theme_minimal() +
    coord_flip() +
    xlab("Variables") +
    ylab("Importance of the variables")

Question 3

What type of working conditions impacts the ability to get tested for COVID?